home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Magnum One
/
Magnum One (Mid-American Digital) (Disc Manufacturing).iso
/
d18
/
kruse_11.arc
/
11TEXT.EXT
< prev
next >
Wrap
Text File
|
1990-11-30
|
27KB
|
761 lines
{11.2 Structuring the Data: The Main Program}
Program IndexText(InText, InIndex, NewIndex, OutIndex, HashFile,
NewHashFile, Input, Output);
{Produces word counts and list of references for the document file InText.
Uses the master word list in file InIndex, if provided.
Output word list for the new text goes to file NewIndex.
The merger of these two files becomes OutIndex.
HashFile contains the common words to be ignored. If not specified, it is
created on output, containing the words so flagged by the user.}
Const
maxwd = 20; {More letters in a word will be ignored.}
minwd = 3; {Shorter words will be ignored}
hashsize = 2003; {should be a prime; size of hash table}
linesperpage = 66; {assumes standard spacing and paper}
maxheight = 20; {for building binary tree in phase 2}
A = 'A';
Z = 'Z';
hyphen = '-';
blank = ' ';
apostrophe = ''''; {requires two apostrophes to represent one}
underscore = '_';
ordbackspace = 8; {ASCII control character for backspace}
ordformfeed = 12; {ASCII control character for new page}
changecase = 32; {ASCII difference between upper and lower case}
Type
word = packed array[1..maxwd] of char;
reference = record
wd: word;
pg: integer; {page number}
end;
fileref = file of reference; {used for local files}
letter = A..Z;
hashentry = 1..hashsize;
Var
InText, {document being processed}
InIndex, {master word list}
NewIndex, {word list of current document}
OutIndex: text; {updated master word list}
HashFile,
NewHashFile: file of word; {local file, used to update HashFile}
RefFile: array[letter] of fileref; {local files used for
auxiliary storage of words from phase 1 to phase 2:
separate file for each initial letter}
blankword: word; {will contain all blanks}
outcount: array[letter] of integer; {counters for word files}
wordcount: integer; {count of all words in the text}
Begin {main program}
SplitWords; {phase 1}
ClassifyWords; {phase 2}
UpdateHashFile; {phase 3, first part}
MergeIndices; {phase 3, second part}
End.
Function Lt(u,v: word): Boolean;
{Determine if word u precedes word v lexicographically.}
Var
i: 1..maxwd; {loop variable}
Begin {function Lt}
i := 1;
While (i < maxwd) and (u[i] = v[i]) do i := i + 1;
Lt := (u[i] < v[i])
{Above is version that works with ASCII code. For codes where blank comes
after letters, modifications are necessary.}
End; {function Lt}
Procedure ReadWord( var F: text; var w: word);
{reads word w from text file F; assumes not at end of file}
Var
c: 1..maxwd;
Begin {procedure ReadWord}
For c := 1 to maxwd do
read(F, w[c])
End; {procedure ReadWord}
procedure WriteWord(var F: text; w: word);
{writes word w to text file F}
var
c: 1..maxwd;
begin
for c := 1 to maxwd do
write(F, w[c])
end;
{11.3 Phase 1: Splitting the Text into Words}
Procedure SplitWords;
{sets up hash table, reads text, and divides into 26 word lists}
Var
hash: array[hashentry] of word; {hash table}
pagecount, {keeps the current page number}
addpage, {amount to increase pagecount after word}
linecount: integer; {line number on the current page}
w: word; {word currently being processed}
x: hashentry; {location of w, if in hash table}
endinput: Boolean; {true if and only if input has all been read}
firstletter: char; {Into which file does word w go?}
{The following are kept for use in procedure GetWord,
and for efficiency are set up only once in procedure Initialize.}
backspace,
formfeed: char; {ASCII control characters}
contchar, {characters OK in the middle of a word}
alphabet: set of char; {letters only --- to start a word}
{Implementation dependent: A good Pascal compiler should allow "set of char";
otherwise, a restricted range is required.}
Begin {procedure SplitWords}
Initialize; {sets up files, hash table, constants}
GetWord(w); {obtains a single word from InText}
While not endinput do
Begin
x := HashAddress(w);
If w <> hash[x] then
Begin {Not in hash table; put into RefFile.}
firstletter := w[1];
outcount[firstletter] := outcount[firstletter] + 1;
With RefFile[firstletter]^ do
Begin
wd := w;
pg := pagecount
End;
Put(RefFile[firstletter])
End;
GetWord(w)
End;
Conclude {writes word counts to Output}
End; {procedure SplitWords}
Function HashAddress(w: word): hashentry;
{calculates the location in hash table of word w, or, if none,
returns pointing to the blank word where w should go}
Var
x, {calculated location}
inc: integer; {increment for open addressing}
Begin {function HashAddress}
x := (ord(w[1]) * ord(w[3]) * ord(w[4]) + ord(w[6])) mod hashsize + 1;
{Hash function assumes long word length. For short word machines, we
must ensure that the result is nonnegative, and worry about overflow.}
If (hash[x] <> w) and (hash[x] <> blankword) then
Begin
inc := 1;
Repeat
x := x + inc;
If x > hashsize then x := x - hashsize;
inc := inc + 2
Until (w = hash[x]) or (blankword = hash[x])
End;
HashAddress := x
End; {function HashAddress}
Procedure Initialize;
{sets up constant-valued sets for use in GetWord;
opens the text file and initializes various counters;
opens file holding hash table (if any), and reads or
otherwise initializes the table.}
Var
ch: char; {used as an index}
i: integer; {general--purpose loop control}
Begin {procedure Initialize}
backspace:= chr(ordbackspace);
formfeed := chr(ordformfeed); {Initialize ASCII control characters.}
alphabet := ['A'..'Z', 'a'..'z']; {letters only, to start a word}
contchar := alphabet + [hyphen, apostrophe, backspace, underscore];
{characters that will not terminate the word}
For i := 1 to maxwd do
blankword[i] := blank;
reset(InText);
endinput := eof(InText);
Repeat
write( 'What is the page number on which the text begins?');
readln(pagecount);
if pagecount < 0 then
writeln('Must be a nonnegative integer.')
until pagecount >= 0;
linecount := 0;
addpage := 0;
wordcount := 0;
For ch := A to Z do
Begin
Rewrite( RefFile[ch] );
Outcount[ch] := 0
End;
reset(HashFile);
if eof(HashFile) then
begin {There is no previous table; initialize the table to all blanks.}
writeln('Cannot open file for hash table. Creating a new table.');
for i := 1 to hashsize do
hash[i] := blankword
end
else begin {Retrieve the previous hash table.}
i := 0;
repeat
i := i + 1;
hash[i] := HashFile^;
get(HashFile)
until eof(HashFile) or (i >= hashsize);
if (not eof(HashFile)) or (i <> hashsize) then
writeln('Error in reading hash table. Incorrect number of entries.')
end
end; {procedure Initialize}
Procedure GetWord( var w: word);
{Gets words from input file InText, and returns only words
at least minwd characters long. Parameter endinput becomes
true if and only if the end of InText is reached with no word to return.
This parameter is set by the subsidiary procedure GetChar.
The procedure also updates global variables wordcount and linecount,
updates the global variable pagecount after each linesperpage cr's,
or after each formfeed, whichever comes first, and
uses the sets alphabet and contchar and various character constants.}
label 1; {used by GetChar to exit procedure on eof(InText)}
Var c: 0..maxwd; {count of characters in word}
ch: char; {character currently processed}
endln: Boolean; {At the end of a line?}
begin {procedure GetWord}
repeat {until current word is at least minwd chars long}
c := 0;
repeat
GetChar(ch) {Find a letter that will start the word.}
until ch in alphabet;
pagecount := pagecount + addpage;
addpage := 0;
If ch in ['a'..'z'] then {Translate the first letter to uppercase.}
ch := chr(ord(ch) - changecase); {system dependent}
AddChar(ch); {Put first letter into the word.}
GetChar(ch);
While ch in contchar do
If ch in alphabet then {Add letters directly to word.}
Begin {processing letter}
AddChar(ch);
GetChar(ch)
End {processing letter}
Else If ch = hyphen then
Begin {processing hyphen}
GetChar(ch); {Find what comes after hyphen.}
If endln then
GetChar(ch) {Delete both the hyphen and the end of line.}
Else if ch = hyphen then {Two hyphens represent a dash.}
ch := blank {Use a blank to terminate the word.}
Else If ch in alphabet then
AddChar(hyphen) {Include hyphens between letters}
Else {nothing} {Delete all other hyphens}
End {processing hyphen}
Else if ch = apostrophe then
Begin {processing apostrophe}
GetChar(ch);
If ch = 's' then {Delete 's at end of word only.}
Begin
GetChar(ch);
If ch in contchar then
Begin
AddChar(apostrophe);
AddChar('s')
End
End
Else if ch in alphabet then
AddChar(apostrophe) {Allow contractions.}
End {processing apostrophe}
Else {Remaining possibilities are backspace and underscore.}
GetChar(ch); {Delete these characters.}
{while loop on continuing characters ends here.}
wordcount := wordcount + 1
Until c >= minwd; {Skip over short words.}
While c < maxwd do {Fill with blanks.}
Begin
c := c + 1;
w[c] := blank
End;
1: {When end of file occurs, program will exit to here from GetChar.}
End; {procedure GetWord}
Procedure GetChar(var ch: char);
{gets a character from input text into ch; checks for eof;
updates page count and line count}
Begin {procedure GetChar}
If eof(InText) then
If c >= minwd then
ch := '.' {special character to end the current word}
Else begin {no word to return; set endinput}
endinput := true;
goto 1 {Exit from GetWord.}
End
Else begin {not at end of file: process next character}
ch := InText^;
endln := eoln(InText);
get(InText);
If endln then
Begin
linecount := linecount + 1;
If linecount >= linesperpage then
Begin
addpage := addpage + 1;
linecount := 0
End
End;
If ch = formfeed then
Begin
addpage := addpage + 1;
linecount := 0;
endln := true; {Treat formfeed like end of line.}
ch := blank
End
End
End; {procedure GetChar}
Procedure AddChar(ch: char);
{adds given character to word, if possible}
Begin {procedure AddChar}
If c < maxwd then
Begin
c := c + 1;
w[c] := ch
End
End; {procedure AddChar}
Procedure Conclude;
{Writes out counts of various word lists. For some systems, it is
necessary to close files, which should be done in this procedure.}
Var
ch: char; {loop index}
Begin {procedure Conclude}
writeln('The total number of words read in is ', wordcount:7);
writeln;
writeln('The number of words to process further in the next stage,');
writeln('beginning with each letter, is below.');
writeln;
for ch := 'A' to 'M' do write(' ', ch:1, ' ');
writeln;
for ch := 'A' to 'M' do write(outcount[ch]:4, ' ');
writeln;
writeln;
for ch := 'N' to 'Z' do write(' ', ch:1, ' ');
writeln;
for ch := 'N' to 'Z' do write(outcount[ch]:4, ' ');
writeln;
writeln
End; {procedure Conclude}
{11.4 Phase 2: Classifying the words}
Procedure ClassifyWords;
{For each letter of the alphabet, the procedure reads in a list of words from
InIndex, builds them into a binary tree, supplements it with entries from
RefFile, and writes result to NewIndex and NewHashFile.}
Type
wordtype = (hash, count, index); {three ways to process a word}
pointref = ^reflist;
reflist = record {list of references}
pg: integer;
next: pointref
end;
pointer = ^node;
node = record {vertex of the binary tree}
wd: word;
left,
right: pointer;
case kind: wordtype of
hash:
(); {empty}
count:
(ct: integer);
index:
(ref: pointref)
end;
Var
root: pointer; {root of the binary tree}
ch: char; {Loop on the first letter of word.}
Begin {procedure ClassifyWords}
writeln('At the appearance of each word, give its disposition:');
writeln(' F --- Forget all occurrences of this word.');
writeln(' C --- Count how many times this word appears.');
writeln(' I --- Index this word: list the pages on which it appears.');
Reset(InIndex);
Rewrite(NewIndex);
For ch := A to Z do {Start main loop on first letter of word.}
Begin
BuildTree(root, ch); {Get the part of master wordlist starting with ch
from the file InIndex, and build it into a binary tree.}
reset(RefFile[ch]);
While not eof(RefFile[ch]) do
Begin
Process(RefFile[ch]^);
{Use new words from RefFile[ch] to update the tree.}
get( RefFile[ch] )
End;
OutputTree(root) {Write the contents of the tree into files NewIndex and
NewHashFile.}
End {main loop on letters of alphabet}
End; {procedure ClassifyWords}
Procedure GetNode( var p: pointer; ch: char);
{reads a word from file InIndex and sets node correspondingly;
returns p = nil at eof or when next word starts later than ch}
Var
wordcode: char; {letter indicating type of word}
Begin {procedure GetNode}
While (not eof(InIndex)) and (InIndex^ = blank) do
Get(InIndex); {Skip all the leading blanks.}
If eof(InIndex) then
p := nil
Else if InIndex^ > ch then
p := nil
Else begin
new(p);
with p^ do begin
ReadWord(InIndex, wd);
Read(InIndex, wordcode);
If wordcode = 'i'
then begin kind := index; ref := nil end
Else if wordcode = 'c'
then begin kind := count; ct := 0 end
Else
Writeln('Erroneous word code in file InIndex.')
End; {with statement setting up node}
readln(InIndex) {Advance to the start of the next entry.}
End
End; {procedure GetNode}
Procedure Process( r: reference);
{takes the word and page reference r and updates the binary tree}
Var
p: pointer; {Trace through the tree.}
found: Boolean; {Is the word in the tree?}
Begin {procedure Process}
If root = nil then {The tree might be empty.}
NewWord(root, r)
Else begin {case of nonempty tree}
p := root; {Begin a tree search.}
found := false;
Repeat
If r.wd = p^.wd then
found := true
Else If Lt(r.wd, p^.wd) then
p := p^.left
Else
p := p^.right
Until found or (p = nil);
If found then UpdateNode(p, r)
Else begin {p^ was not found: add it to the tree.}
NewWord(p, r);
InsertTree(root, p)
End
End
End; {procedure Process}
Procedure UpdateNode( p: pointer; r: reference);
{uses reference r to update information in node p^}
Var
q: pointref; {used to add reference to list}
Begin {procedure UpdateNode}
With p^ do
Case kind of
hash:; {no action needed}
count: ct := ct + 1;
index: If ref = nil then
Begin
new(ref);
ref^.pg := r.pg;
ref^.next := nil
End
Else if ref^.pg <> r.pg then
Begin {Add the new reference to the list.}
New(q);
q^.pg := r.pg;
q^.next := ref;
ref := q
End
End {case statement to update tree}
End; {procedure UpdateNode}
Procedure NewWord(var p: pointer; r: reference);
{Creates a node for the first occurrence of a new reference r.
A pointer to the new node is returned in p.}
Var
response: char; {answer received from user}
Begin {procedure NewWord}
new(p);
With p^ do
Begin
wd := r.wd;
left := nil;
right := nil;
Repeat {Ask user what kind of word.}
WriteWord(output, wd);
write('is (F, C, I)?');
read(response)
Until response in ['F', 'C', 'I' ,'f', 'c', 'i'];
Case response of
'F','f': kind := hash;
'C','c': Begin
kind := count;
ct := 1
End;
'I','i': Begin
kind := index;
new(ref);
ref^.pg := r.pg;
ref^.next := nil;
End
End {case statement}
End {with statement}
End; {procedure NewWord}
Procedure InsertTree(r, p: pointer);
{adds a node p^ to the tree with root r^; requires that r <> nil
and p^ not be in the tree; proceeds by recursion}
Begin {procedure InsertTree}
If Lt(p^.wd, r^.wd) then
If r^.left = nil then r^.left := p
Else InsertTree(r^.left, p)
Else
If r^.right = nil then r^.right := p
Else InsertTree(r^.right, p)
End; {procedure InsertTree}
Procedure OutputTree( p: pointer);
{traverses the tree for which p^ is the root in inorder}
Begin {procedure OutputTree}
If p <> nil then
With p^ do
Begin
OutputTree(left); {Traverse the left subtree.}
PutNode(p);
OutputTree(right); {Traverse the right subtree.}
Dispose(p)
End
End; {procedure OutputTree}
Procedure PutNode(p: pointer);
Var
q: pointref; {used to traverse list of references}
Begin {procedure PutNode}
With p^ do
Case kind of
hash: Begin
NewHashFile^ := wd;
put( NewHashFile )
End;
count: If ct <> 0 then {Otherwise, word is not in the document.}
Begin
WriteWord(NewIndex, wd);
write(NewIndex, 'c');
writeln( NewIndex, ct:5)
End;
index: If ref <> nil then
Begin
WriteWord(NewIndex, wd);
write(NewIndex, 'i');
q := ref;
Repeat
write( NewIndex, q^.pg:5);
q := q^.next
Until q = nil;
writeln( NewIndex )
End
End {case statement}
End; {procedure PutNode}
{11.5 Phase 3: Updating the Permanent Files}
Procedure UpdateHashFile;
{reads in old hash table, inserts file of new entries; writes out to HashFile}
Var
hash: array[hashentry] of word;
x: hashentry;
w: word;
Begin {procedure UpdateHashFile}
reset(HashFile);
If eof(HashFile) then {HashFile is empty; create new table.}
For x := 1 to hashsize do
hash[x] := blankword
Else
For x := 1 to hashsize do
read(HashFile, hash[x]);
{Some versions of Pascal do not allow procedures read and write for
files other than text. For such systems, expand to use get and put.}
reset(NewHashFile);
While not eof(NewHashFile) do
Begin
read(NewHashFile, w);
hash[HashAddress(w)] := w
{If the table is full, new entries will replace old ones.}
End;
rewrite(HashFile);
For x := 1 to hashsize do
Write(HashFile, hash[x])
End; {procedure UpdateHashFile}
Procedure MergeIndices;
{merges files NewIndex and InIndex into file OutIndex}
Var
u, v: word; {for new and old indices, respectively}
m, n: integer; {counts for above entries}
ukind,
vkind: char; {Is the word of kind i or c?}
Begin {procedure MergeIndices}
reset(NewIndex);
reset(InIndex);
rewrite(OutIndex);
If eof(NewIndex) or eof(InIndex) then
writeln('One of the indices is empty. No merge will be done.')
Else Begin
ReadWord(NewIndex, u);
ReadWord( InIndex, v);
Repeat
If Lt(u,v) then
CopyLine(u, NewIndex, true, true)
{Boolean parameters mean, respectively; start new line; end the line.}
Else If Lt(v,u) then
CopyLine(v, InIndex, true, true)
Else begin {Words are equal. Determine the kind of word.}
read(NewIndex, ukind);
read( InIndex, vkind);
If ukind <> vkind then
writeln('Inconsistent word types found in merge.');
WriteWord(OutIndex, u);
write(OutIndex, ukind);
If ukind = 'c' then
Begin
readln(NewIndex, m);
readln( InIndex, n);
m := m + n;
writeln(OutIndex, m:5);
If not eof(NewIndex) then ReadWord(NewIndex, u);
If not eof( InIndex) then ReadWord( InIndex, v)
End
Else begin {Copy both lists of page numbers.}
CopyLine(u, NewIndex, false, false);
CopyLine(v, InIndex, false, true)
End
End {finished processing equal words}
Until eof(NewIndex) or eof(InIndex);
While not eof(NewIndex) do
CopyLine(u, NewIndex, true, true);
While not eof(InIndex) do
CopyLine(v, InIndex, true, true)
{At most one of the two loops above will iterate.}
End
End; {procedure MergeIndices}
Procedure CopyLine( var w: word; var F: text; newline, endline: Boolean);
{Copies the remainder of a line from the file F to OutIndex.
If newline is true, then the word w is also written, and kind is copied.
If endline is true, then the line written to OutIndex is ended.
The procedure also reads a new word w from the next line in F.}
Var
n: integer; {number copied from file to file}
kind: char; {word code copied from file to file}
Begin {procedure CopyLine}
If newline then
Begin
WriteWord(OutIndex, w);
read(F, kind);
write(OutIndex, kind)
End Else
While (not eof(F)) and (not eoln(F)) and (F^ = blank) do
get(F);
While (not eof(F)) and (not eoln(F)) do
Begin
read(F, n);
write(OutIndex, n:5);
While (not eoln(F)) and (F^ = blank) do
get(F); {Skip blanks.}
End;
readln(F);
If not eof(F) then
ReadWord(F, w);
If endline then writeln(OutIndex)
End; {procedure CopyLine}
{
1. Program IndexText(InText, InIndex, NewIndex, OutIndex,
HashFile, Input, Output);
2. Function Lt(u, v: word): Boolean;
3. Procedure ReadWord(var F: text; var w: word);
4. Procedure WriteWord(var F: text; w: word);
5. Procedure SplitWords; phase 1
6. Function HashAddress(w: word): hashentry;
7. Procedure Initialize;
8. Procedure GetWord;
9. Procedure GetChar(var ch: char);
10. Procedure AddChar(ch: char);
11. Procedure Conclude;
12. Procedure ClassifyWords; phase 2
13. Procedure BuildTree(var root: pointer; ch: char);
14. Procedure Insert(p: pointer);
15. Function Power2(c: integer): level;
16. Procedure FindRoot;
17. Procedure ConnectSubtrees;
18. Procedure GetNode(var p: pointer; ch: char);
19. Procedure Process(r: reference);
20. Procedure UpdateNode(p: pointer; r: reference);
21. Procedure NewWord(var p: pointer; r: reference);
22. Procedure InsertTree(r, p: pointer);
23. Procedure OutputTree(p: pointer);
24. Procedure PutNode(p: pointer);
25. Procedure UpdateHashFile; phase 3
26. Function HashAddress(w: word): hashentry;
27. Procedure MergeIndices;
28. Procedure CopyLine
}